#knitr::opts_chunk$set(echo = TRUE)
Based on the fact that Zillow’s housing prediction system doesn’t factor in enough local intelligence, our team aims to build a better predictive model of home prices for San Francisco.
In this project, we are interested in general location factors such as longitude and latitude, building factors like the number of stories and the average house acreage, and we think the racial distribution could be important as well. We carefully follow the steps below to build up the linear regression model.
load required packages, set up map theme, read in boundary shapefile
library(corrplot)
## corrplot 0.84 loaded
library(caret)
## Warning: package 'caret' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: ggplot2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(AppliedPredictiveModeling)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.5.2
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.1.3 ✔ purrr 0.2.5
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.2
## ── Conflicts ───────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
library(sf)
## Warning: package 'sf' was built under R version 3.5.2
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
library(FNN)
## Warning: package 'FNN' was built under R version 3.5.2
library(httr)
## Warning: package 'httr' was built under R version 3.5.2
##
## Attaching package: 'httr'
## The following object is masked from 'package:caret':
##
## progress
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(grid)
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library("corrplot")
library(spdep)
## Warning: package 'spdep' was built under R version 3.5.2
## Loading required package: sp
## Loading required package: spData
## Warning: package 'spData' was built under R version 3.5.2
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
# Shared ggplot2 theme for non-map charts.
#
# Fix: `base_size` was declared but never used; all text sizes were
# hard-coded. Sizes are now derived from `base_size`, and the default (12)
# reproduces the original values exactly:
#   title 18 (= 1.5 * 12), strip 12 (= 12), axis title/text 8 (= 2/3 * 12).
plotTheme <- function(base_size = 12) {
  theme(
    text = element_text(color = "black"),
    plot.title = element_text(size = base_size * 1.5, colour = "black"),
    plot.subtitle = element_text(face = "italic"),
    plot.caption = element_text(hjust = 0),
    axis.ticks = element_blank(),
    panel.background = element_blank(),
    panel.grid.major = element_line("grey80", size = 0.1),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "grey80", color = "white"),
    strip.text = element_text(size = base_size),
    axis.title = element_text(size = base_size * 2 / 3),
    axis.text = element_text(size = base_size * 2 / 3),
    axis.title.x = element_text(hjust = 1),
    axis.title.y = element_text(hjust = 1),
    plot.background = element_blank(),
    legend.background = element_blank(),
    legend.title = element_text(colour = "black", face = "italic"),
    legend.text = element_text(colour = "black", face = "italic"))
}
# And another that we will use for maps
# Shared ggplot2 theme for maps (axes suppressed entirely).
#
# Fix: as with plotTheme, `base_size` was declared but unused. Text sizes
# are now derived from it; the default (12) reproduces the original values
# exactly (title 18 = 1.5 * 12, strip 12 = 12).
mapTheme <- function(base_size = 12) {
  theme(
    text = element_text(color = "black"),
    plot.title = element_text(size = base_size * 1.5, colour = "black"),
    plot.subtitle = element_text(face = "italic"),
    plot.caption = element_text(hjust = 0),
    axis.ticks = element_blank(),
    panel.background = element_blank(),
    panel.grid.major = element_line("grey80", size = 0.1),
    strip.text = element_text(size = base_size),
    # Maps carry no axis annotation at all.
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "grey80", color = "white"),
    plot.background = element_blank(),
    legend.background = element_blank(),
    legend.title = element_text(colour = "black", face = "italic"),
    legend.text = element_text(colour = "black", face = "italic"))
}
# Define some palettes
# Sequential teal-to-magenta palettes for quantile map legends; the 9/8/7
# variants share the same endpoints (#0DA3A0 -> #F055E9 for 9 and 8; the
# 7-color version starts one stop in) with stops re-interpolated per length.
palette_9_colors <- c("#0DA3A0","#2999A9","#458FB2","#6285BB","#7E7CC4","#9A72CD","#B768D6","#D35EDF","#F055E9")
palette_8_colors <- c("#0DA3A0","#2D97AA","#4D8CB4","#6E81BF","#8E76C9","#AF6BD4","#CF60DE","#F055E9")
palette_7_colors <- c("#2D97AA","#4D8CB4","#6E81BF","#8E76C9","#AF6BD4","#CF60DE","#F055E9")
# Single-color palette for one-class layers.
palette_1_colors <- c("#0DA3A0")
# NOTE(review): ggmap is already attached earlier in the script; this second
# library() call is redundant (harmless, but can be dropped).
library(ggmap)
# NOTE(review): setwd() with an absolute, user-specific path makes the script
# non-portable; prefer project-relative paths. Nothing below actually relies
# on the working directory (all reads use absolute paths).
setwd("/Users/chu/Desktop/UPENN/midterm")
# Parcel-level sales table with candidate predictor columns.
SFSales <- read_csv("/Users/chu/Desktop/UPENN/midterm/house_price_indicators.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## ParcelID = col_character(),
## Address = col_character(),
## PropClassC = col_character(),
## LandUse = col_character(),
## ZoningCode = col_character(),
## X = col_double(),
## Y = col_double(),
## long = col_double(),
## lat = col_double(),
## Ave_PropArea = col_double(),
## ave_fam_sz = col_double(),
## ave_hh_sz = col_double(),
## Zmarhh_noch = col_double(),
## med_age = col_double(),
## Zasian = col_double(),
## Zrent = col_double(),
## Zblack = col_double(),
## Zwhite = col_double(),
## Zvacant = col_double(),
## Zfemale = col_double()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 33 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 7207 Ave_PropA… a double #DIV/… '/Users/chu/Desktop/UPENN/midterm/house… file 2 7207 Zmarhh_no… a double #DIV/… '/Users/chu/Desktop/UPENN/midterm/house… row 3 7207 Zasian a double #DIV/… '/Users/chu/Desktop/UPENN/midterm/house… col 4 7207 Zrent a double #DIV/… '/Users/chu/Desktop/UPENN/midterm/house… expected 5 7207 Zblack a double #DIV/… '/Users/chu/Desktop/UPENN/midterm/house…
## ... ................. ... ........................................................................... ........ ........................................................................... ...... ........................................................................... .... ........................................................................... ... ........................................................................... ... ........................................................................... ........ ...........................................................................
## See problems(...) for more details.
# Download San Francisco neighborhood boundaries (GeoJSON from DataSF) and
# project to EPSG:2227 (NAD83 / California zone 3, US-feet).
# NOTE(review): network I/O at run time; consider caching the file locally.
SFnhoods <-
st_read("https://data.sfgov.org/api/geospatial/p5b7-5n3h?method=export&format=GeoJSON") %>% st_transform(2227)
## Reading layer `OGRGeoJSON' from data source `https://data.sfgov.org/api/geospatial/p5b7-5n3h?method=export&format=GeoJSON' using driver `GeoJSON'
## Simple feature collection with 41 features and 1 field
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -122.5149 ymin: 37.70813 xmax: -122.357 ymax: 37.8333
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
# Quick visual check of the boundary layer (sf plots one facet per attribute).
plot(SFnhoods)
## 1.2 Independent Variable Exploration (with both RStudio and ArcMap) ### 1.2.1 Race We download the race map from the ACS dataset and try to see whether there is a relationship between race and house price.
## [1] "b02ab8ff41d238346455dfafa6f85751fe1d5b6c"
## [1] "b02ab8ff41d238346455dfafa6f85751fe1d5b6c"
## Simple feature collection with 6 features and 5 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -122.4267 ymin: 37.79323 xmax: -122.3897 ymax: 37.81144
## epsg (SRID): 4269
## proj4string: +proj=longlat +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +no_defs
## # A tibble: 6 x 6
## GEOID NAME variable value summary_value geometry
## <chr> <chr> <chr> <dbl> <dbl> <MULTIPOLYGON [°]>
## 1 06075… Census … White 1723 3739 (((-122.4206 37.81111, -122…
## 2 06075… Census … White 3358 4143 (((-122.425 37.811, -122.42…
## 3 06075… Census … White 2340 3852 (((-122.4149 37.80354, -122…
## 4 06075… Census … White 2863 4545 (((-122.4129 37.80218, -122…
## 5 06075… Census … White 1655 2685 (((-122.3944 37.80129, -122…
## 6 06075… Census … White 1331 3894 (((-122.4058 37.7999, -122.…
### 1.2.2 Average Family Size
We also download the average family size indicator from ACS dataset
## Simple feature collection with 6 features and 5 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -122.4267 ymin: 37.79323 xmax: -122.3897 ymax: 37.81144
## epsg (SRID): 4269
## proj4string: +proj=longlat +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +no_defs
## GEOID NAME
## 1 06075010100 Census Tract 101, San Francisco County, California
## 2 06075010200 Census Tract 102, San Francisco County, California
## 3 06075010300 Census Tract 103, San Francisco County, California
## 4 06075010400 Census Tract 104, San Francisco County, California
## 5 06075010500 Census Tract 105, San Francisco County, California
## 6 06075010600 Census Tract 106, San Francisco County, California
## variable estimate moe geometry
## 1 B25010_001 1.95 0.13 MULTIPOLYGON (((-122.4206 3...
## 2 B25010_001 1.74 0.15 MULTIPOLYGON (((-122.4267 3...
## 3 B25010_001 2.16 0.15 MULTIPOLYGON (((-122.4187 3...
## 4 B25010_001 1.91 0.13 MULTIPOLYGON (((-122.4149 3...
## 5 B25010_001 1.71 0.16 MULTIPOLYGON (((-122.4052 3...
## 6 B25010_001 1.92 0.14 MULTIPOLYGON (((-122.411 37...
### 1.2.3 Households
We acquire households dataset from ACS as one of our indicators too.
## Simple feature collection with 6 features and 5 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -122.4267 ymin: 37.79323 xmax: -122.3897 ymax: 37.81144
## epsg (SRID): 4269
## proj4string: +proj=longlat +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +no_defs
## GEOID NAME
## 1 06075010100 Census Tract 101, San Francisco County, California
## 2 06075010200 Census Tract 102, San Francisco County, California
## 3 06075010300 Census Tract 103, San Francisco County, California
## 4 06075010400 Census Tract 104, San Francisco County, California
## 5 06075010500 Census Tract 105, San Francisco County, California
## 6 06075010600 Census Tract 106, San Francisco County, California
## variable estimate moe geometry
## 1 B19013_001 81509 19512 MULTIPOLYGON (((-122.4206 3...
## 2 B19013_001 125238 22090 MULTIPOLYGON (((-122.4267 3...
## 3 B19013_001 118210 22286 MULTIPOLYGON (((-122.4187 3...
## 4 B19013_001 106979 21462 MULTIPOLYGON (((-122.4149 3...
## 5 B19013_001 108063 21684 MULTIPOLYGON (((-122.4052 3...
## 6 B19013_001 37000 5428 MULTIPOLYGON (((-122.411 37...
## 1.3 Exploratory Analysis #In this part, we’re going to take a general look at the possible variables and their relationships with one another. First, read in the new table with the added independent variables.
# Re-read the sales table after new indicator columns were appended upstream.
SFSales <- read_csv("/Users/chu/Desktop/UPENN/midterm/house_price_indicators.csv")
We think house price is a function of
#1 generalLocation
# General-location candidate predictors: row id, parcel id, coordinates.
# NOTE(review): the summary below shows `long` around 37.7x and `lat` around
# -122.x -- the coordinate columns appear to be swapped in the source CSV.
# Verify before interpreting the long/lat regression coefficients.
generalLocation<- SFSales %>% select(TARGET_FID,ParcelID ,long,lat)
summary(generalLocation)
## TARGET_FID ParcelID long lat
## Min. : 0 Length:10133 Min. :37.71 Min. :-122.5
## 1st Qu.: 2533 Class :character 1st Qu.:37.73 1st Qu.:-122.5
## Median : 5066 Mode :character Median :37.74 Median :-122.4
## Mean : 5066 Mean :37.75 Mean :-122.4
## 3rd Qu.: 7599 3rd Qu.:37.76 3rd Qu.:-122.4
## Max. :10132 Max. :37.81 Max. :-122.4
# Same summary rendered as a markdown table for the report.
kable(summary(generalLocation), format = "markdown", padding = 2)
| TARGET_FID | ParcelID | long | lat | |
|---|---|---|---|---|
| Min. : 0 | Length:10133 | Min. :37.71 | Min. :-122.5 | |
| 1st Qu.: 2533 | Class :character | 1st Qu.:37.73 | 1st Qu.:-122.5 | |
| Median : 5066 | Mode :character | Median :37.74 | Median :-122.4 | |
| Mean : 5066 | NA | Mean :37.75 | Mean :-122.4 | |
| 3rd Qu.: 7599 | NA | 3rd Qu.:37.76 | 3rd Qu.:-122.4 | |
| Max. :10132 | NA | Max. :37.81 | Max. :-122.4 |
#2 Census-derived tract predictors. The two count columns read as integers
# (pop2000, households) are promoted to numeric; the Z* columns are shares
# in [0, 1]. A handful of rows (5-6) carry NAs, visible in the summary.
censusData<- SFSales %>%
select(pop2000,med_age,Zwhite,Zblack,Zfemale,Zasian,Zmarhh_noch,households,ave_hh_sz,age_65_up) %>%
mutate(pop2000 = as.numeric(pop2000),
households = as.numeric(households))
summary(censusData)
## pop2000 med_age Zwhite Zblack
## Min. : 52 Min. :19.70 Min. :0.0300 Min. :0.00000
## 1st Qu.:1047 1st Qu.:35.70 1st Qu.:0.3200 1st Qu.:0.01000
## Median :1310 Median :38.00 Median :0.5000 Median :0.02000
## Mean :1462 Mean :38.31 Mean :0.4986 Mean :0.07519
## 3rd Qu.:1744 3rd Qu.:40.60 3rd Qu.:0.7000 3rd Qu.:0.07000
## Max. :4649 Max. :75.30 Max. :0.9400 Max. :0.74000
## NA's :5 NA's :5 NA's :5 NA's :5
## Zfemale Zasian Zmarhh_noch households
## Min. :0.1500 Min. :0.0100 Min. :0.0500 Min. : 10.0
## 1st Qu.:0.4900 1st Qu.:0.1200 1st Qu.:0.3700 1st Qu.: 383.0
## Median :0.5100 Median :0.3100 Median :0.4400 Median : 490.5
## Mean :0.4995 Mean :0.3149 Mean :0.4394 Mean : 558.6
## 3rd Qu.:0.5200 3rd Qu.:0.4800 3rd Qu.:0.5000 3rd Qu.: 661.0
## Max. :0.6400 Max. :0.8900 Max. :0.8900 Max. :2099.0
## NA's :5 NA's :5 NA's :6 NA's :5
## ave_hh_sz age_65_up
## Min. :1.290 Min. : 1.0
## 1st Qu.:2.130 1st Qu.:127.0
## Median :2.630 Median :186.0
## Mean :2.705 Mean :203.3
## 3rd Qu.:3.230 3rd Qu.:247.2
## Max. :4.680 Max. :993.0
## NA's :5 NA's :5
# Same summary rendered as a markdown table for the report.
kable(summary(censusData), format = "markdown", padding = 2)
| pop2000 | med_age | Zwhite | Zblack | Zfemale | Zasian | Zmarhh_noch | households | ave_hh_sz | age_65_up | |
|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 52 | Min. :19.70 | Min. :0.0300 | Min. :0.00000 | Min. :0.1500 | Min. :0.0100 | Min. :0.0500 | Min. : 10.0 | Min. :1.290 | Min. : 1.0 | |
| 1st Qu.:1047 | 1st Qu.:35.70 | 1st Qu.:0.3200 | 1st Qu.:0.01000 | 1st Qu.:0.4900 | 1st Qu.:0.1200 | 1st Qu.:0.3700 | 1st Qu.: 383.0 | 1st Qu.:2.130 | 1st Qu.:127.0 | |
| Median :1310 | Median :38.00 | Median :0.5000 | Median :0.02000 | Median :0.5100 | Median :0.3100 | Median :0.4400 | Median : 490.5 | Median :2.630 | Median :186.0 | |
| Mean :1462 | Mean :38.31 | Mean :0.4986 | Mean :0.07519 | Mean :0.4995 | Mean :0.3149 | Mean :0.4394 | Mean : 558.6 | Mean :2.705 | Mean :203.3 | |
| 3rd Qu.:1744 | 3rd Qu.:40.60 | 3rd Qu.:0.7000 | 3rd Qu.:0.07000 | 3rd Qu.:0.5200 | 3rd Qu.:0.4800 | 3rd Qu.:0.5000 | 3rd Qu.: 661.0 | 3rd Qu.:3.230 | 3rd Qu.:247.2 | |
| Max. :4649 | Max. :75.30 | Max. :0.9400 | Max. :0.74000 | Max. :0.6400 | Max. :0.8900 | Max. :0.8900 | Max. :2099.0 | Max. :4.680 | Max. :993.0 | |
| NA’s :5 | NA’s :5 | NA’s :5 | NA’s :5 | NA’s :5 | NA’s :5 | NA’s :6 | NA’s :5 | NA’s :5 | NA’s :5 |
#3 BuildingFactor
# Structure-level predictors; the categorical property class and story count
# are converted to factors so summary() tabulates their levels.
# NOTE(review): Rooms has a max of 1353 (vs a 3rd quartile of 7) -- almost
# certainly a data-entry error worth filtering before modeling. Stories also
# contains a "0" level (538 parcels); confirm what 0 stories means.
buildingFactor<- SFSales %>%
select(PropClassC,HouseAge,Stories,Rooms,Beds,Baths)%>%
mutate(PropClassC = as.factor(PropClassC),
Stories= as.factor(Stories))
summary(buildingFactor)
## PropClassC HouseAge Stories Rooms
## D :8676 Min. : 3.00 1 :6235 Min. : 0.000
## Z :1273 1st Qu.: 71.00 2 :2813 1st Qu.: 5.000
## TIC : 58 Median : 90.00 0 : 538 Median : 6.000
## F : 57 Mean : 84.98 3 : 499 Mean : 6.309
## LZ : 24 3rd Qu.:106.00 4 : 41 3rd Qu.: 7.000
## DA : 16 Max. :149.00 6 : 2 Max. :1353.000
## (Other): 29 NA's :97 (Other): 5
## Beds Baths
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 1.000
## Median : 2.000 Median : 2.000
## Mean : 1.693 Mean : 1.794
## 3rd Qu.: 3.000 3rd Qu.: 2.000
## Max. :20.000 Max. :25.000
##
# Same summary rendered as a markdown table for the report.
kable(summary(buildingFactor), format = "markdown", padding = 2)
| PropClassC | HouseAge | Stories | Rooms | Beds | Baths | |
|---|---|---|---|---|---|---|
| D :8676 | Min. : 3.00 | 1 :6235 | Min. : 0.000 | Min. : 0.000 | Min. : 0.000 | |
| Z :1273 | 1st Qu.: 71.00 | 2 :2813 | 1st Qu.: 5.000 | 1st Qu.: 0.000 | 1st Qu.: 1.000 | |
| TIC : 58 | Median : 90.00 | 0 : 538 | Median : 6.000 | Median : 2.000 | Median : 2.000 | |
| F : 57 | Mean : 84.98 | 3 : 499 | Mean : 6.309 | Mean : 1.693 | Mean : 1.794 | |
| LZ : 24 | 3rd Qu.:106.00 | 4 : 41 | 3rd Qu.: 7.000 | 3rd Qu.: 3.000 | 3rd Qu.: 2.000 | |
| DA : 16 | Max. :149.00 | 6 : 2 | Max. :1353.000 | Max. :20.000 | Max. :25.000 | |
| (Other): 29 | NA’s :97 | (Other): 5 | NA | NA | NA |
#4 Lot/occupancy predictors.
# NOTE(review): LotArea has a minimum of 0 -- verify whether zero-area lots
# are genuine or missing values encoded as 0.
spatial<- SFSales %>%
select(LotArea,PropArea,Zrent,Zvacant)
summary(spatial)
## LotArea PropArea Zrent Zvacant
## Min. : 0 Min. : 187 Min. :0.0600 Min. :0.00000
## 1st Qu.: 187500 1st Qu.: 1158 1st Qu.:0.2500 1st Qu.:0.02000
## Median : 250000 Median : 1492 Median :0.3700 Median :0.03000
## Mean : 246118 Mean : 1648 Mean :0.4064 Mean :0.03267
## 3rd Qu.: 300000 3rd Qu.: 1976 3rd Qu.:0.5600 3rd Qu.:0.04000
## Max. :1890500 Max. :24308 Max. :0.9800 Max. :0.20000
## NA's :73 NA's :5 NA's :5
# Same summary rendered as a markdown table for the report.
kable(summary(spatial), format = "markdown", padding = 2)
| LotArea | PropArea | Zrent | Zvacant | |
|---|---|---|---|---|
| Min. : 0 | Min. : 187 | Min. :0.0600 | Min. :0.00000 | |
| 1st Qu.: 187500 | 1st Qu.: 1158 | 1st Qu.:0.2500 | 1st Qu.:0.02000 | |
| Median : 250000 | Median : 1492 | Median :0.3700 | Median :0.03000 | |
| Mean : 246118 | Mean : 1648 | Mean :0.4064 | Mean :0.03267 | |
| 3rd Qu.: 300000 | 3rd Qu.: 1976 | 3rd Qu.:0.5600 | 3rd Qu.:0.04000 | |
| Max. :1890500 | Max. :24308 | Max. :0.9800 | Max. :0.20000 | |
| NA | NA’s :73 | NA’s :5 | NA’s :5 |
Identify numeric variables and display the correlations between variables.
# Recode the PropClassC letter codes to numeric labels so the column can be
# coerced for the correlation matrix. A named lookup vector replaces the
# original eleven copy-pasted assignments. Behavior is unchanged: values not
# in the table are left untouched, and because the column is still character
# at this point the replacement values are coerced to "1".."11", exactly as
# the original scalar assignments did.
propclass_lookup <- c(D = 1, Z = 2, TIC = 3, F = 4, LZ = 5, DA = 6,
                      A = 7, ZEU = 8, ZBM = 9, TH = 10, OZ = 11)
hit <- SFSales$PropClassC %in% names(propclass_lookup)
SFSales$PropClassC[hit] <- propclass_lookup[SFSales$PropClassC[hit]]
# Assemble the candidate-variable table for the correlation plot, coercing
# every column to numeric in a single pass (the original repeated
# `as.numeric()` for each of the 25 columns).
# NOTE(review): ParcelID is a character identifier, so as.numeric() yields
# NA (with a warning) exactly as before; those rows are then dropped by the
# na.omit() below -- consider excluding ParcelID from a correlation matrix.
mydata <- SFSales %>%
  select(SalePrice, TARGET_FID, ParcelID, PropClassC, long, lat,
         LotArea, PropArea, HouseAge, Stories, Rooms, Beds, Baths,
         age_65_up, ave_hh_sz, households, Zmarhh_noch, med_age,
         pop2000, Zasian, Zrent, Zblack, Zwhite, Zvacant, Zfemale) %>%
  mutate_all(as.numeric)
# Correlation matrix over complete cases only.
newdata <- na.omit(mydata)
res <- cor(newdata)
# Fix: the original call contained a stray empty argument (`45,,tl.cex`),
# which passes a missing positional argument into corrplot().
corrplot(res, type = "upper", order = "hclust",
         tl.col = "black", tl.srt = 45, tl.cex = 0.8)
# Map sale-price quintiles (via ntile) over the neighborhood boundaries.
# Fix: columns are now referenced by bare name inside aes() instead of the
# fragile `SFSaleomit$...` form, which bypasses ggplot2's data masking.
# NOTE(review): the legend labels come from the .1/.2/.4/.6/.8 quantiles
# while the colors are ntile() fifths, so labels only approximate the
# actual break points -- confirm this is intended.
SFSaleomit <- na.omit(SFSales)
ggplot() +
  geom_sf(data = SFnhoods, fill = "black", color = NA) +
  geom_point(data = SFSaleomit,
             aes(x = X, y = Y, color = factor(ntile(SalePrice, 5))),
             size = 0.1) +
  scale_colour_manual(values = c("#edf8fb","#b3cde3","#8c96c6","#8856a7","#810f7c"),
                      labels = as.character(quantile(SFSaleomit$SalePrice,
                                                     c(.1,.2,.4,.6,.8), na.rm = T)),
                      name = "Home Price \n(Quantile Breaks)") +
  labs(
    title = "Home Price Distribution"
  ) +
  mapTheme()
# 2. Modeling Now, we are going to build the OLS model relating our dependent variable, sale price, to the selected predictors.
The predictors in the model are: SalePrice; TARGET_FID; PropClassC; long; lat; LotArea; PropArea; HouseAge; Stories; Rooms; Beds; Baths; age_65_up; ave_hh_sz; households; Zmarhh_noch; med_age; pop2000; Zasian; Zrent; Zblack; Zvacant; Zwhite; Zfemale
About 12 variables are selected according to the significance levels shown in the regression model below. The final model is built with these variables and can be seen in the last part, before the prediction on the entire dataset.
The goodness of fit will be illustrated below.
# Full indicator table used for modeling and the final prediction set.
SFallDATA <- read_csv("/Users/chu/Desktop/UPENN/midterm/indicator.csv")
# NOTE(review): columns are picked by magic positional indices; selecting by
# name would survive upstream column reordering. Confirm the intended set.
SFselectData <- SFallDATA[,c(3,6,10,11,12,13,14,15,16,17,18,19,20,22,23,24,25,27,29,32,34,35,36,37:42)]
# Recode PropClassC in SFselectData to numeric labels.
# Fix: the original indexed SFselectData with logical masks computed from
# SFSales -- a DIFFERENT table read from a different CSV -- so replacements
# landed on misaligned rows (the regression output below still shows the raw
# letter codes as factor levels). Both sides of the assignment now use
# SFselectData, and the eleven copy-pasted assignments become one lookup.
propclass_codes <- c(D = 1, Z = 2, TIC = 3, F = 4, LZ = 5, DA = 6,
                     A = 7, ZEU = 8, ZBM = 9, TH = 10, OZ = 11)
recode_rows <- SFselectData$PropClassC %in% names(propclass_codes)
SFselectData$PropClassC[recode_rows] <- propclass_codes[SFselectData$PropClassC[recode_rows]]
# Rows with SalePrice == 0 are unsold parcels held back for later prediction;
# the remainder (complete cases only) form the modeling set.
SFpredict <- subset(SFselectData, SFselectData$SalePrice == 0)
SFmodel <- subset(SFselectData, SFselectData$SalePrice != 0)
SFmodel<-na.omit(SFmodel)
#randomly split dataset
set.seed(153)
# NOTE(review): require() only warns when caTools is missing; library() would
# fail fast and is preferred for hard dependencies.
require(caTools)
# NOTE(review): this 50/50 split is recomputed identically (same seed) in the
# out-of-sample section below; transect1/transect2 are not used by reg1.
sample = sample.split(SFmodel$SalePrice,SplitRatio = 1/2)
transect1 =subset(SFmodel,sample ==TRUE)
transect2=subset(SFmodel, sample ==FALSE)
# NOTE(review): dplyr is already attached at the top of the script.
library(dplyr)
# In-sample OLS on the full modeling set with every candidate predictor.
reg1 <- lm(SalePrice ~ ., data = as.data.frame(SFmodel) %>%
dplyr::select(SalePrice,TARGET_FID, PropClassC,long,lat,LotArea, PropArea,HouseAge,Stories,Rooms,Beds, Baths,age_65_up, ave_hh_sz, households, Zmarhh_noch, med_age, pop2000,Zasian, Zrent,Zblack, Zvacant,Zwhite,Zfemale ))
summary(reg1)
##
## Call:
## lm(formula = SalePrice ~ ., data = as.data.frame(SFmodel) %>%
## dplyr::select(SalePrice, TARGET_FID, PropClassC, long, lat,
## LotArea, PropArea, HouseAge, Stories, Rooms, Beds, Baths,
## age_65_up, ave_hh_sz, households, Zmarhh_noch, med_age,
## pop2000, Zasian, Zrent, Zblack, Zvacant, Zwhite, Zfemale))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6661450 -246062 -28057 197651 2919413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.031e+06 3.328e+07 -0.241 0.809344
## TARGET_FID 9.702e+00 2.273e+00 4.268 1.99e-05 ***
## PropClassCD 1.140e+06 3.064e+05 3.721 0.000199 ***
## PropClassCDA 1.076e+06 3.275e+05 3.286 0.001021 **
## PropClassCF 9.322e+05 3.134e+05 2.974 0.002943 **
## PropClassCLZ 7.182e+05 3.210e+05 2.237 0.025280 *
## PropClassCOZ -8.390e+04 5.322e+05 -0.158 0.874749
## PropClassCTH 9.152e+05 3.283e+05 2.788 0.005322 **
## PropClassCTIC -2.490e+05 3.123e+05 -0.797 0.425267
## PropClassCZ 7.842e+05 3.068e+05 2.556 0.010606 *
## PropClassCZBM 3.991e+05 3.391e+05 1.177 0.239176
## PropClassCZEU 2.995e+06 4.345e+05 6.893 5.82e-12 ***
## long 8.560e+06 4.605e+05 18.588 < 2e-16 ***
## lat 2.600e+06 2.364e+05 11.001 < 2e-16 ***
## LotArea 4.441e-01 4.483e-02 9.906 < 2e-16 ***
## PropArea 2.953e+02 7.992e+00 36.946 < 2e-16 ***
## HouseAge -6.903e+01 2.017e+02 -0.342 0.732209
## Stories 1.478e+02 3.630e+02 0.407 0.683904
## Rooms 2.578e+02 3.205e+02 0.804 0.421191
## Beds -5.818e+03 3.271e+03 -1.779 0.075338 .
## Baths 4.001e+04 6.285e+03 6.367 2.02e-10 ***
## age_65_up -4.899e+02 1.265e+02 -3.872 0.000109 ***
## ave_hh_sz 2.210e+05 2.547e+04 8.676 < 2e-16 ***
## households 1.554e+02 5.818e+01 2.670 0.007591 **
## Zmarhh_noch 1.623e+05 9.465e+04 1.715 0.086407 .
## med_age 5.286e+03 3.009e+03 1.757 0.078981 .
## pop2000 3.187e+00 2.985e+01 0.107 0.914969
## Zasian 4.187e+05 1.044e+05 4.009 6.14e-05 ***
## Zrent 2.402e+05 5.381e+04 4.464 8.14e-06 ***
## Zblack 6.223e+05 1.060e+05 5.870 4.50e-09 ***
## Zvacant 9.033e+05 3.014e+05 2.998 0.002729 **
## Zwhite 2.110e+06 1.267e+05 16.652 < 2e-16 ***
## Zfemale 8.138e+05 1.348e+05 6.038 1.62e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 432400 on 9286 degrees of freedom
## Multiple R-squared: 0.6199, Adjusted R-squared: 0.6186
## F-statistic: 473.4 on 32 and 9286 DF, p-value: < 2.2e-16
According to the summary, the Multiple R-squared is 0.62, which means 62% of the variance in the data can be explained by the model, but R^2 is only one measure of “goodness”.
We are going to use another way to test the model. A good prediction model should produce predicted values that closely match the observed values collected from the real world. The red line in the graph below is produced from real-world data, and the blue line represents our linear regression model. From the points’ distribution, we can observe that the predicted and observed values are positively correlated.
# Compare observed sale prices with the in-sample fitted values.
regdf <- data.frame(Observation = SFmodel$SalePrice,
                    Prediction = reg1$fitted.values)
ggplot() +
  geom_point(data = regdf, aes(Observation, Prediction)) +
  # Reference line: a perfect model would track observed == observed.
  stat_smooth(data = regdf, aes(Observation, Observation),
              method = "loess", se = FALSE, size = 1, colour = "red") +
  # Linear trend of the actual predictions.
  stat_smooth(data = regdf, aes(Observation, Prediction),
              method = "lm", se = FALSE, size = 1, colour = "blue") +
  labs(title = "Predicted Saleprice as a function\nof Observed Saleprice (In-sample)",
       subtitle = "Perfect prediction in red; Actual prediction in blue") +
  theme(plot.title = element_text(size = 18, colour = "black"))
As we can see in the graph above, it’s quite clear that there are some outliers on the right side of the plot, which are points of extremely high sale price. ### 2.1.3 Residuals One more important “goodness” factor is residuals. To figure out whether the model is feasible, the prerequisite is to have randomly distributed residuals.
#hist(abs(vars$SalePrice - exp(reg$fitted.values)), breaks=100, main="Histrogram of residuals (absolute values)",xlim = c(0,1000000),ylim = c(0,5000))
# Distribution of absolute in-sample residuals.
# Fix: corrected the "Histrogram" typo in the user-facing plot title.
hist(abs(SFmodel$SalePrice - (reg1$fitted.values)), breaks = 100,
     main = "Histogram of residuals (absolute values)",
     xlim = c(0, 4000000), ylim = c(0, 2000))
#View(reg$residuals)
#Trying to get the map
# Bundle residuals with sale price and coordinates for mapping.
# NOTE(review): wrapping a one-column select() result inside data.frame()
# appears to keep the INNER column name, so these columns come out as
# `long`/`lat` rather than `Longitude`/`Latitude` -- confirm. The plotting
# code below only uses `X`, `Y` and `residuals`, which do match.
regResiduals <-
data.frame(residuals=reg1$residuals,
SalePrice = SFmodel %>% select(SalePrice),
Longitude = SFmodel %>% select(long),
Latitude = SFmodel %>% select(lat),
X = SFmodel %>% select(X),
Y = SFmodel %>% select(Y))
# Map the quintile-binned regression residuals; spatial clustering here would
# indicate location effects the model has not captured.
# Fixes: labs() was given `title` twice, which triggered the plyr
# duplicate-name warning -- the second entry is now the subtitle. Columns are
# also referenced by bare name inside aes() instead of `regResiduals$...`.
ggplot() +
  geom_sf(data = SFnhoods, fill = "black", color = NA) +
  geom_point(data = regResiduals,
             aes(x = X, y = Y, color = factor(ntile(residuals, 5))),
             size = 0.5) +
  scale_colour_manual(values = c("#edf8fb","#b3cde3","#8c96c6","#8856a7","#810f7c"),
                      labels = as.character(quantile(regResiduals$residuals,
                                                     c(.1,.2,.4,.6,.8), na.rm = T)),
                      name = "Residuals \n(Quantile Breaks)") +
  labs(
    title = "Residual of Regression Model",
    subtitle = "in-sample prediction"
  ) +
  mapTheme()
## Warning: The plyr::rename operation has created duplicates for the
## following name(s): (`title`)
## 2.2 Out-of-sample Prediction
In order to simulate the out-of-sample usefulness of the model, we separate the dataset used for in-sample prediction into a test set (50%) and a training set (50%). The new training set is used to train the model, and the new test set to validate it. If the model predicts well on the test set, then we can be confident about the generalization capability of the model and that we are not over-fitting.
# Re-create the 50/50 train/test split; set.seed(153) makes it identical to
# the split already computed in the in-sample section above.
set.seed(153)
# NOTE(review): require() only warns when caTools is missing; library() fails fast.
require(caTools)
sample = sample.split(SFmodel$SalePrice,SplitRatio = 1/2)
transect1 =subset(SFmodel,sample ==TRUE)
transect2=subset(SFmodel, sample ==FALSE)
#TRAINING
# Fit the reduced 12-predictor model on the training half (transect1).
reg2 <- lm(SalePrice ~ ., data = as.data.frame(transect1) %>%
dplyr::select(SalePrice,TARGET_FID,long,lat,LotArea, PropArea, Baths,age_65_up, ave_hh_sz, households, Zvacant,Zwhite,Zfemale ))
summary(reg2)
##
## Call:
## lm(formula = SalePrice ~ ., data = as.data.frame(transect1) %>%
## dplyr::select(SalePrice, TARGET_FID, long, lat, LotArea,
## PropArea, Baths, age_65_up, ave_hh_sz, households, Zvacant,
## Zwhite, Zfemale))
##
## Residuals:
## Min 1Q Median 3Q Max
## -5573846 -260481 -28686 205049 2937784
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.205e+07 3.608e+07 -1.997 0.045851 *
## TARGET_FID 1.071e+01 3.144e+00 3.406 0.000663 ***
## long 8.940e+06 5.576e+05 16.033 < 2e-16 ***
## lat 2.178e+06 2.473e+05 8.809 < 2e-16 ***
## LotArea 9.410e-01 5.195e-02 18.116 < 2e-16 ***
## PropArea 2.438e+02 9.841e+00 24.773 < 2e-16 ***
## Baths 5.040e+04 7.235e+03 6.966 3.63e-12 ***
## age_65_up -3.020e+02 8.282e+01 -3.647 0.000268 ***
## ave_hh_sz 1.322e+05 2.357e+04 5.610 2.12e-08 ***
## households 1.036e+02 3.774e+01 2.746 0.006061 **
## Zvacant 1.373e+06 4.235e+05 3.241 0.001199 **
## Zwhite 1.607e+06 5.908e+04 27.199 < 2e-16 ***
## Zfemale 9.714e+05 1.689e+05 5.750 9.40e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 476100 on 5604 degrees of freedom
## Multiple R-squared: 0.5898, Adjusted R-squared: 0.5889
## F-statistic: 671.5 on 12 and 5604 DF, p-value: < 2.2e-16
#TEST
# NOTE(review): this REFITS the model on the test half, so summary(reg3)
# reports an in-sample fit for transect2, not the out-of-sample performance
# of the trained model reg2. Comparing its coefficients/R^2 with reg2's is at
# best a stability check across the two halves -- confirm this is intentional.
reg3 <- lm(SalePrice ~ ., data = as.data.frame(transect2) %>%
dplyr::select(SalePrice,TARGET_FID,long,lat,LotArea, PropArea, Baths,age_65_up, ave_hh_sz, households, Zvacant,Zwhite,Zfemale ))
summary(reg3)
##
## Call:
## lm(formula = SalePrice ~ ., data = as.data.frame(transect2) %>%
## dplyr::select(SalePrice, TARGET_FID, long, lat, LotArea,
## PropArea, Baths, age_65_up, ave_hh_sz, households, Zvacant,
## Zwhite, Zfemale))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2158349 -229705 -28142 185525 2988297
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.583e+06 3.836e+07 -0.146 0.884299
## TARGET_FID 1.061e+01 3.411e+00 3.111 0.001880 **
## long 7.189e+06 6.017e+05 11.947 < 2e-16 ***
## lat 2.179e+06 2.586e+05 8.426 < 2e-16 ***
## LotArea 8.046e-01 5.980e-02 13.454 < 2e-16 ***
## PropArea 3.551e+02 1.497e+01 23.722 < 2e-16 ***
## Baths -6.132e+03 9.864e+03 -0.622 0.534195
## age_65_up -3.744e+02 8.906e+01 -4.204 2.69e-05 ***
## ave_hh_sz 9.338e+04 2.488e+04 3.753 0.000177 ***
## households 1.438e+02 4.178e+01 3.443 0.000583 ***
## Zvacant 1.228e+06 4.427e+05 2.773 0.005577 **
## Zwhite 1.263e+06 6.313e+04 20.000 < 2e-16 ***
## Zfemale 7.972e+05 1.906e+05 4.184 2.93e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 409600 on 3689 degrees of freedom
## Multiple R-squared: 0.5767, Adjusted R-squared: 0.5753
## F-statistic: 418.7 on 12 and 3689 DF, p-value: < 2.2e-16
# Out-of-sample evaluation: predict the held-out half with the model trained
# on the OTHER half (reg2). The original code predicted with reg3, a model
# refit on transect2 itself, which scores the test set in-sample and
# overstates generalization performance.
sanfran.test <-
  transect2 %>%
  mutate(SalePrice.Predict = predict(reg2, transect2),
         SalePrice.Error = SalePrice - SalePrice.Predict,
         SalePrice.AbsError = abs(SalePrice - SalePrice.Predict),
         SalePrice.APE = abs(SalePrice - SalePrice.Predict) / SalePrice) %>%
  as.data.frame() %>%
  na.exclude()
# True out-of-sample R^2 = 1 - SSE/SST on the test set (not the refit
# model's in-sample R^2).
oos_r2 <- 1 - sum(sanfran.test$SalePrice.Error ^ 2) /
  sum((sanfran.test$SalePrice - mean(sanfran.test$SalePrice)) ^ 2)
test_GoodnessFit <- data.frame("R^2" = oos_r2,
                               MAE = mean(sanfran.test$SalePrice.AbsError),
                               MAPE = mean(sanfran.test$SalePrice.APE),
                               check.names = FALSE)
kable(test_GoodnessFit, format = "markdown")
| R^2 | MAE | MAPE |
|---|---|---|
| 0.5766547 | 283847.6 | 0.2861876 |
Map of residuals for test dataset
# Assemble the test-set residuals with their coordinates for mapping.
# Extract plain vectors with $ so the column names are guaranteed
# (data.frame() on nested one-column frames can mangle names).
regResiduals2 <-
  data.frame(residuals = reg3$residuals,
             SalePrice = transect2$SalePrice,
             Longitude = transect2$long,
             Latitude = transect2$lat,
             X = transect2$X,
             Y = transect2$Y)
ggplot() +
  geom_sf(data = SFnhoods, fill = "black", color = NA) +
  geom_point(data = regResiduals2,
             # Reference columns directly inside aes(); df$col inside aes()
             # bypasses the layer data and is a known ggplot2 anti-pattern.
             aes(x = X, y = Y, color = factor(ntile(residuals, 5))),
             size = 0.5) +
  scale_colour_manual(values = c("#edf8fb","#b3cde3","#8c96c6","#8856a7","#810f7c"),
                      labels = as.character(quantile(regResiduals2$residuals,
                                                     c(.1, .2, .4, .6, .8),
                                                     na.rm = TRUE)),
                      name = "Residuals \n(Quantile Breaks)") +
  labs(
    title = "Residual of Regression Model",
    # The original passed `title =` twice, which triggered the plyr::rename
    # duplicate-name warning; the second entry was meant to be a subtitle.
    subtitle = "25% randomly selected test set in out-of-sample prediction"
  ) +
  mapTheme()
## Warning: The plyr::rename operation has created duplicates for the
## following name(s): (`title`)
With the 3702 test observations, we obtain an out-of-sample R^2 of about 0.58, meaning that we explain roughly 58% of the variance in price out of sample, which is quite close to our in-sample performance, indicating that our model is not overfitting. The map above shows the out-of-sample prediction residuals. We want to check whether our predictions perform equally well across space. There seems to be a little clustering; for a statistical check, we run a test below.
# Moran's I test for spatial autocorrelation in the test-set residuals.
# knearneigh/knn2nb/nb2listw/moran.test come from the spdep package --
# presumably loaded in an earlier (hidden) chunk; verify before running.
coords <- cbind(regResiduals2$X, regResiduals2$Y)
# Row-standardized ("W") weights built from each point's 4 nearest neighbours.
spatialWeights <- knn2nb(knearneigh(coords, 4))
moran.test(regResiduals2$residuals, nb2listw(spatialWeights, style="W"))
##
## Moran I test under randomisation
##
## data: regResiduals2$residuals
## weights: nb2listw(spatialWeights, style = "W")
##
## Moran I statistic standard deviate = 14.224, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.1550596365 -0.0002701972 0.0001192496
The p-value we observed is less than 0.05. It suggests that we can reject the null hypothesis and that our residuals exhibit statistically significant spatial clustering.
There is still a possibility that we reached this “not overfitting” conclusion (similar variance explained in and out of sample) by chance. To reduce the randomness of such a result, we run cross-validation — that is, performing the training and testing 100 times.
library(caret)
# 100-fold cross-validation on the training half. Note this is k-fold CV
# (each fold holds out ~1% of the rows), not 100 repeated 50/50 splits.
fitControl <- trainControl(method = "cv", number = 100)
set.seed(140)
# NOTE(review): SalePrice ~ . here uses all 28 predictors in transect1 (see
# the printed resampling summary), not the 12-variable subset used for reg2 --
# confirm this is intentional, since the resampled R^2 is not directly
# comparable to reg2's.
lmFit <- train(SalePrice ~ ., data = transect1,
method = "lm",
trControl = fitControl)
lmFit
## Linear Regression
##
## 5617 samples
## 28 predictor
##
## No pre-processing
## Resampling: Cross-Validated (100 fold)
## Summary of sample sizes: 5561, 5561, 5561, 5561, 5561, 5561, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 453420.5 0.7085107 286232.5
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Histogram of the per-fold R^2 values from the 100-fold cross-validation.
cv_folds <- as.data.frame(lmFit$resample)
ggplot(cv_folds, aes(x = Rsquared)) +
  geom_histogram(bins = 20) +
  labs(
    x = "R^2",
    y = "Count",
    title = "Cross Validation R^2 Distribution"
  ) +
  theme(plot.title = element_text(size = 18, colour = "black"))
This plot shows the R^2 for the 100 folds. The distribution has a clear central tendency towards 0.6–0.7, so we can reasonably conclude that this reflects our out-of-sample predictive power rather than luck.
Now that we are confident in our model, we are going to see how it performs on our entire dataset to get a larger picture of San Francisco home prices. We apply the regression model trained on SFmodel to predict the sale price for the test dataset.
##
## Call:
## lm(formula = SalePrice ~ ., data = as.data.frame(SFmodel) %>%
## dplyr::select(SalePrice, TARGET_FID, long, lat, LotArea,
## PropArea, Baths, age_65_up, ave_hh_sz, households, Zvacant,
## Zwhite, Zfemale))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6199292 -250217 -31523 199257 2980621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.219e+07 2.663e+07 -1.960 0.05 .
## TARGET_FID 1.117e+01 2.340e+00 4.775 1.83e-06 ***
## long 8.396e+06 4.140e+05 20.282 < 2e-16 ***
## lat 2.172e+06 1.813e+05 11.978 < 2e-16 ***
## LotArea 9.126e-01 3.943e-02 23.146 < 2e-16 ***
## PropArea 2.724e+02 8.132e+00 33.503 < 2e-16 ***
## Baths 3.712e+04 5.776e+03 6.427 1.37e-10 ***
## age_65_up -3.385e+02 6.142e+01 -5.512 3.64e-08 ***
## ave_hh_sz 1.193e+05 1.735e+04 6.875 6.59e-12 ***
## households 1.205e+02 2.832e+01 4.255 2.11e-05 ***
## Zvacant 1.339e+06 3.107e+05 4.310 1.65e-05 ***
## Zwhite 1.481e+06 4.368e+04 33.909 < 2e-16 ***
## Zfemale 9.107e+05 1.276e+05 7.134 1.04e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 452700 on 9306 degrees of freedom
## Multiple R-squared: 0.5825, Adjusted R-squared: 0.5819
## F-statistic: 1082 on 12 and 9306 DF, p-value: < 2.2e-16
# Pair each observed sale price with reg4's fitted value. reg4 is the final
# model fit on the full SFmodel dataset -- presumably in a hidden chunk above
# (its summary is printed just before this block); confirm its specification.
regdf4<-cbind(SFmodel$SalePrice,reg4$fitted.values)
colnames(regdf4)<-c("Observation","Prediction")
regdf4<-as.data.frame(regdf4)
# Predict sale prices, with prediction intervals, for the parcels in SFpredict
# (prepared in an earlier chunk; its schema is not visible here).
prediction_result<-as.data.frame(predict(reg4,SFpredict,interval = "prediction"))
summary(prediction_result)
## fit lwr upr
## Min. : 130134 Min. :-757720 Min. :1017988
## 1st Qu.: 726217 1st Qu.:-161692 1st Qu.:1614126
## Median :1127292 Median : 239051 Median :2015533
## Mean :1167814 Mean : 279732 Mean :2055895
## 3rd Qu.:1506757 3rd Qu.: 618608 3rd Qu.:2394906
## Max. :3406810 Max. :2514968 Max. :4298652
## NA's :1 NA's :1 NA's :1
# reg4 models SalePrice on the raw dollar scale (see the summary above: fitted
# values range roughly 1e5 to 3.4e6), so its predictions are already dollars.
# Exponentiating them -- a leftover from a log-price specification -- would
# overflow to Inf, so the exp() call has been removed.
SFpredict$PredSalePrice <- prediction_result$fit
# Stack observed-parcel fitted values and new-parcel predictions with their
# coordinates so the whole city can be mapped together.
entirePrediction <-
  data.frame(Longitude = c(SFmodel$X, SFpredict$X),
             Latitude = c(SFmodel$Y, SFpredict$Y),
             prediction_price = c(regdf4$Prediction, SFpredict$PredSalePrice)) %>%
  na.omit()
# Map predicted home prices for the entire dataset, binned into quintiles.
ggplot() +
  geom_sf(data = SFnhoods, fill = "black", color = NA) +
  geom_point(data = entirePrediction,
             # Reference columns directly inside aes(); entirePrediction$...
             # inside aes() bypasses the layer data and is a ggplot2
             # anti-pattern (breaks faceting, confuses evaluation scoping).
             aes(x = Longitude, y = Latitude,
                 color = factor(ntile(prediction_price, 5))),
             size = 0.1) +
  scale_colour_manual(values = c("#edf8fb","#b3cde3","#8c96c6","#8856a7","#810f7c"),
                      labels = as.character(quantile(entirePrediction$prediction_price,
                                                     c(.1, .2, .4, .6, .8),
                                                     na.rm = TRUE)),
                      name = "Homeprice \n(Quantile Breaks)") +
  labs(
    title = "Home Price Prediction",
    subtitle = "Entire Dataset"
  ) +
  mapTheme()
One particular concern in home-price prediction is generalizability across space: we want our model to perform equally well in different areas. To check this, we examine the mean error by zip code area in the test data.
# Load the exported test-set map table (per-parcel predictions, errors, APE,
# plus neighborhood attributes). NOTE(review): absolute, machine-specific path
# breaks reproducibility for anyone else knitting this document -- prefer a
# relative path or here::here().
compare_result <- read_csv("/Users/chu/Desktop/UPENN/midterm/testmap.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## sanfrantestMAP.X = col_double(),
## sanfrantestMAP.Y = col_double(),
## sanfrantestMAP.long = col_double(),
## sanfrantestMAP.lat = col_double(),
## sanfrantestMAP.Ave_PropArea = col_double(),
## sanfrantestMAP.ave_hh_sz = col_double(),
## sanfrantestMAP.Zmarhh_noch = col_double(),
## sanfrantestMAP.med_age = col_double(),
## sanfrantestMAP.Zasian = col_double(),
## sanfrantestMAP.Zrent = col_double(),
## sanfrantestMAP.Zblack = col_double(),
## sanfrantestMAP.Zwhite = col_double(),
## sanfrantestMAP.Zvacant = col_double(),
## sanfrantestMAP.Zfemale = col_double(),
## sanfrantestMAP.SalePrice_Predict = col_double(),
## sanfrantestMAP.SalePrice_Error = col_double(),
## sanfrantestMAP.SalePrice_AbsError = col_double(),
## sanfrantestMAP.SalePrice_APE = col_double(),
## HousePriceMap.ParcelID = col_character(),
## HousePriceMap.Address = col_character()
## # ... with 10 more columns
## )
## See spec(...) for full column specifications.
# Keep the APE alongside the raw columns. NOTE(review): test2 is not used in
# the visible code below -- confirm a later chunk needs it, else drop it.
test2<-cbind(compare_result,APE=compare_result$sanfrantestMAP.SalePrice_APE)
# Mean absolute percentage error (MAPE) per neighborhood.
zipMAPE <-
data.frame(nbrhood=as.factor(compare_result$HousePriceMap.nbrhood),
zipMAPE = as.numeric(compare_result$sanfrantestMAP.SalePrice_APE)
)
zipMAPE<-aggregate(zipMAPE$zipMAPE,by=list(zipMAPE$nbrhood), FUN=mean, na.rm=TRUE)
colnames(zipMAPE) <- c("nbrhood","zipMAPE")
#head(zipMAPE)
# NOTE(review): absolute, machine-specific path hurts reproducibility.
zipMap<-read_sf("/Users/chu/Desktop/UPENN/midterm/SF_neighborhoods/SF_neighborhoods.shp")
# Attach MAPE to the neighborhood polygons; all.x keeps polygons that had no
# test-set sales (their zipMAPE becomes NA).
zipMap<-merge(x = zipMap, y = zipMAPE, by = "nbrhood", all.x = TRUE)
# Choropleth of per-neighborhood MAPE on the test set.
ggplot() +
  geom_sf(data = zipMap, aes(fill = zipMAPE), colour = "black") +
  labs(
    # Fixed the displayed title: "Abusolute" -> "Absolute", and added the
    # missing space before "(MAPE)".
    title = "Mean Absolute Percentage Error (MAPE) by Zip Code",
    subtitle = "25% test set"
  ) +
  mapTheme()
We do see differences in goodness of fit across zip code areas. While the overall error rate (MAPE) is 0.29, it varies when broken down by zip area.
Now we know that our model is not performing evenly across zip code areas, to see the pattern of our performance, we scale zip code areas by their average homeprice.
In this plot, we go back to the training dataset
# Build per-neighborhood MAPE and mean sale price, to test whether model
# error varies systematically with price level.
zipMAPE2 <-
  data.frame(nbrhood = as.factor(compare_result$HousePriceMap.nbrhood),
             zipMAPE = as.numeric(compare_result$sanfrantestMAP.SalePrice_APE),
             zipPrice = as.numeric(compare_result$sanfrantestMAP.SalePrice))
# Mean APE per neighborhood. (Dropped the redundant c() wrappers -- the
# columns are already atomic vectors.)
zipMAPE2_1 <- aggregate(zipMAPE2$zipMAPE, by = list(zipMAPE2$nbrhood),
                        FUN = mean, na.rm = TRUE)
colnames(zipMAPE2_1) <- c("nbrhood", "zipMAPE")
# Mean sale price per neighborhood.
zipMAPE2_2 <- aggregate(zipMAPE2$zipPrice, by = list(zipMAPE2$nbrhood),
                        FUN = mean, na.rm = TRUE)
colnames(zipMAPE2_2) <- c("nbrhood", "zipPrice")
# Full outer join so a neighborhood missing either metric still appears.
zipMap2 <- merge(x = zipMAPE2_1, y = zipMAPE2_2, by = "nbrhood", all = TRUE)
# Scatter of neighborhood MAPE against neighborhood mean price, with a loess
# trend line highlighting where the model errs most.
ggplot(zipMap2, aes(x = zipPrice, y = zipMAPE)) +
  geom_point() +
  stat_smooth(method = "loess", se = FALSE, size = 1, colour = "red") +
  labs(
    title = "MAPE by Neighborhood as a Function \nof Mean Price by Neighborhood"
  ) +
  theme(plot.title = element_text(size = 18, colour = "black"))
So, we are not making random errors across neighborhoods. The plot shows that our error rate is larger when the home price in an area is “not average”; in particular, we perform worse in low-price areas.
Our team created a linear regression model with the data provided by the professor, available in the Open Data Portal. This is an effective model, since it has an R-squared of 0.58, which is high: about 58% of the variation in prices can be predicted. The most influential features in our final model are the percentages of white residents and of females, the location, the number of people aged above 65, the average household size, the average income per family, the vacancy rate, the lot area and the property area. Of course, other factors — interior characteristics such as the number of bathrooms, and amenities — affect the sale price as well. However, because this model is built using the most up-to-date open-source data, it has time limitations for future use: users should keep the data updated to obtain the most precise predictions, and the initial data should also be precise enough to generalize a prediction model with strong power. We would recommend this model to Zillow, because our model captures spatial variation in census variables such as race. If we considered more spatial data, such as crime or education, we could build an even better model.